TV Script Generation
Table of Contents
What is this about?
In this project, you'll generate your own Seinfeld TV scripts using RNNs. You'll be using part of the Seinfeld dataset of scripts from 9 seasons. The Neural Network you'll build will generate a new, "fake" TV script, based on patterns it recognizes in this training data.
Set Up
Imports
Python
from collections import Counter
from pathlib import Path
import os

import numpy as np
PyPi
Support Code
import udacity.project_tv_script_generation.problem_unittests as unittests
Load Dotenv
# NOTE(review): ``load_dotenv`` has no import in the visible source -
# presumably it comes from python-dotenv; confirm the import exists.
load_dotenv()
Get the Data
class Scripts:
    """Seinfeld Scripts

    Lazily loads the scripts file. The location is read from an environment
    variable the first time ``path`` is used, and each derived value (the raw
    text, the lines, the token counts) is computed once and then cached.

    Args:
     environment_key: environment variable with the source location
    """
    def __init__(self, environment_key: str="SCRIPTS") -> None:
        self.environment_key = environment_key
        self._script_blob = None
        self._path = None
        self._lines = None
        self._tokens = None
        return

    @property
    def path(self) -> Path:
        """The path to the file

        Raises:
         KeyError: the environment variable isn't set
         FileNotFoundError: the variable doesn't point to an existing file
        """
        if self._path is None:
            load_dotenv()
            # Use the key given to the constructor instead of a hard-coded
            # "SCRIPTS" so that alternate sources can actually be selected.
            location = os.environ[self.environment_key]
            path = Path(location).expanduser()
            # An explicit check (rather than ``assert``) still runs under
            # ``python -O``.
            if not path.is_file():
                raise FileNotFoundError(
                    "{} does not point to a file: {}".format(
                        self.environment_key, path))
            # Only cache once validated, otherwise a bad path would be
            # returned unchecked on the next access.
            self._path = path
        return self._path

    @property
    def script_blob(self) -> str:
        """The input file as a string"""
        if self._script_blob is None:
            with open(self.path, encoding="utf-8") as reader:
                self._script_blob = reader.read()
        return self._script_blob

    @property
    def lines(self) -> list:
        """The lines of the script

        The text is split on newline characters only, so a trailing newline
        in the file produces a final empty string.
        """
        if self._lines is None:
            self._lines = self.script_blob.split("\n")
        return self._lines

    @property
    def tokens(self) -> Counter:
        """The tokens (whitespace-separated strings) and their counts"""
        if self._tokens is None:
            self._tokens = Counter(self.script_blob.split())
        return self._tokens
class ScriptInspector:
    """Rough summary statistics for the scripts

    Each statistic is worked out the first time it is asked for and the
    result is kept for later requests.

    Args:
     scripts: object with the scripts (built on first use if not given)
    """
    def __init__(self, scripts: Scripts=None) -> None:
        self._scripts = scripts
        self._line_count = None
        self._count_per_line = None
        self._mean_words_per_line = None
        self._token_count = None
        return

    @property
    def scripts(self) -> Scripts:
        """The object that supplies the lines and tokens"""
        if self._scripts is not None:
            return self._scripts
        self._scripts = Scripts()
        return self._scripts

    @property
    def line_count(self) -> int:
        """How many lines the source has"""
        if self._line_count is not None:
            return self._line_count
        self._line_count = len(self.scripts.lines)
        return self._line_count

    @property
    def count_per_line(self) -> list:
        """Number of single-space-separated pieces in each line

        Splitting is on a single space, so an empty line counts as one piece
        and doubled spaces add empty pieces.
        """
        if self._count_per_line is not None:
            return self._count_per_line
        pieces_by_line = (line.split(" ") for line in self.scripts.lines)
        self._count_per_line = list(map(len, pieces_by_line))
        return self._count_per_line

    @property
    def mean_words_per_line(self) -> float:
        """Total of the per-line counts divided by the number of lines"""
        if self._mean_words_per_line is not None:
            return self._mean_words_per_line
        total = sum(self.count_per_line)
        self._mean_words_per_line = total/self.line_count
        return self._mean_words_per_line

    @property
    def token_count(self) -> int:
        """Sum of the counts of every token in the text"""
        if self._token_count is not None:
            return self._token_count
        self._token_count = sum(self.scripts.tokens.values())
        return self._token_count

    def most_common_tokens(self, count: int=10) -> list:
        """(token, count) tuples, most frequent first

        Args:
         count: how many tuples to put in the list
        """
        token_counts = self.scripts.tokens
        return token_counts.most_common(count)

    def line_range(self, start: int=0, stop: int=10) -> list:
        """A slice of the lines

        Args:
         start: index of the first line to include
         stop: index one past the last line to include
        """
        all_lines = self.scripts.lines
        return all_lines[start:stop]
The scripts aren't really in a format that is optimized for pandas, at least not for this initial look, so we'll just load it as text.
# No ``Scripts`` object is passed in, so the inspector creates a default one
# the first time it needs the data.
inspector = ScriptInspector()
Explore the Data
Note that the first line is a header, but we're ignoring that and including it with the counts. So this is very rough.
# NOTE(review): this tuple isn't referenced anywhere else in the visible
# source - presumably left over from the project template; confirm.
view_line_range = (0, 10)
Dataset Statistics
# ``token_count`` adds up every occurrence of every token, so it is the total
# number of tokens rather than the number of distinct ones, and the label
# says so.
lines = (("Number of tokens", "{:,}".format(inspector.token_count)),
         ("Number of lines", "{:,}".format(inspector.line_count)),
         ("Average number of words in each line", "{:.2f}".format(
             inspector.mean_words_per_line)))
print(tabulate(lines, headers="Statistic Value".split(), tablefmt="orgtbl"))
| Statistic | Value |
|---|---|
| Number of tokens | 550,996 |
| Number of lines | 54,618 |
| Average number of words in each line | 10.09 |
Top Ten Words
# Table of the ten most frequent tokens with thousands-separated counts.
top_ten = inspector.most_common_tokens()
lines = ((token, "{:,}".format(count)) for token, count in top_ten)
table = tabulate(lines, headers=["Token", "Count"], tablefmt="orgtbl")
print(table)
| Token | Count |
|---|---|
| the | 16,373 |
| I | 13,911 |
| you | 12,831 |
| a | 12,096 |
| to | 11,594 |
| of | 5,490 |
| and | 5,210 |
| in | 4,741 |
| is | 4,283 |
| that | 4,047 |
So it looks like the stop words are the most common, as you might expect.
The First five Lines
# Show the lines at indices 0 through 4 of the file.
first_five = inspector.line_range(stop=5)
for script_line in first_five:
    print(script_line)
,Character,Dialogue,EpisodeNo,SEID,Season 0,JERRY,"Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gotta go.",1.0,S01E01,1.0 1,JERRY,"(pointing at Georges shirt) See, to me, that button is in the worst possible spot. The second button literally makes or breaks the shirt, look at it. Its too high! Its in no-mans-land. You look like you live with your mother.",1.0,S01E01,1.0 2,GEORGE,Are you through?,1.0,S01E01,1.0 3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
As you can see it is a comma-separated file with a header. What's not so obvious is how the index works. Is it for all the lines? Since the episode number is in the row-data I would assume so.
Pre-Processing the Text
The first thing to do to any dataset is pre-processing. Implement the following pre-processing functions below:
- Lookup Table
- Tokenize Punctuation
Lookup Table
To create a word embedding, you first need to transform the words to ids. In this function, create two dictionaries:
- Dictionary to go from the words to an id, we'll call `vocab_to_int`
- Dictionary to go from the id to word, we'll call `int_to_vocab`
Return these dictionaries in the following tuple (vocab_to_int, int_to_vocab)
import problem_unittests as tests
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary

    Every distinct word gets one integer id. Ids start at 0 and are handed
    out from the most frequent word to the least frequent one.

    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    ranked_words = [word for word, _ in Counter(text).most_common()]
    vocab_to_int = {word: index for index, word in enumerate(ranked_words)}
    int_to_vocab = dict(enumerate(ranked_words))
    return (vocab_to_int, int_to_vocab)
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_create_lookup_tables(create_lookup_tables)
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.

    Each token is a single word without spaces and without any of the
    punctuation symbols themselves, so that a token can be split from its
    neighbors on whitespace and is never mistaken for a real word.

    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotation_mark||',
        ';': '||semicolon||',
        '!': '||exclamation_mark||',
        '?': '||question_mark||',
        '(': '||left_parentheses||',
        ')': '||right_parentheses||',
        '-': '||dash||',
        '\n': '||return||',
    }
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_tokenize(token_lookup)
""" DON'T MODIFY ANYTHING IN THIS CELL """
# Pre-process the text with the two functions defined above and save the
# result.
# NOTE(review): ``data_dir`` is not assigned anywhere in the visible source,
# and ``helper`` is only imported by the statement that follows this call -
# confirm both are defined by an earlier cell.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)
""" DON'T MODIFY ANYTHING IN THIS CELL """
import helper
import problem_unittests as tests

# Load the saved pre-processing output back in.
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
""" DON'T MODIFY ANYTHING IN THIS CELL """
import torch

# Record whether CUDA is available; a warning is printed when it is not.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')
from torch.utils.data import TensorDataset, DataLoader
def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader

    Every run of ``sequence_length`` consecutive word ids becomes one feature
    row and the word id that follows the run becomes its target. The rows are
    shuffled each time the loader is iterated.

    :param words: The word ids of the TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: DataLoader with batched data
    :raises ValueError: if there are too few words to build a single sequence
    """
    word_ids = torch.as_tensor(list(words), dtype=torch.long)
    if len(word_ids) <= sequence_length:
        raise ValueError(
            "need more than {} words to build a sequence, got {}".format(
                sequence_length, len(word_ids)))
    # ``unfold`` gives every sliding window as a view (no copy); the last
    # window has no word after it to predict, so it is dropped.
    features = word_ids.unfold(0, sequence_length, 1)[:-1]
    targets = word_ids[sequence_length:]
    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Try the loader on a small, easy-to-read sequence of ids.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# The builtin ``next`` is the iterator protocol; a ``.next()`` method is not
# something iterators are required to have.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)
import torch.nn as nn
class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer

    The network reads a batch of word-id sequences and produces one row of
    scores per sequence, taken from the last time step.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module

        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of embeddings, should you choose to use them
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(RNN, self).__init__()
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Dropout is applied between stacked layers only, so it is switched
        # off when there is a single layer.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network

        :param nn_input: The input to the neural network, word ids laid out as (batch_size, sequence_length)
        :param hidden: The hidden state
        :return: Two Tensors, the output of the neural network and the latest hidden state
        """
        embedded = self.embedding(nn_input.long())
        lstm_out, hidden = self.lstm(embedded, hidden)
        # Only the last time step is used to score the next word, which gives
        # an output of (batch_size, output_size).
        scores = self.fc(lstm_out[:, -1, :])
        return scores, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU

        :param batch_size: The batch_size of the hidden state
        :return: tuple (hidden state, cell state) of zero tensors, each of dims (n_layers, batch_size, hidden_dim)
        '''
        # Build the zeros from an existing parameter so that they share the
        # model's dtype and device.
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_rnn(RNN, train_on_gpu)
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network

    Runs one optimization step on a single batch.

    :param rnn: The PyTorch Module that holds the neural network
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :param hidden: The hidden state carried over from the previous batch
    :return: The loss and the latest hidden state Tensor
    """
    # Put the batch wherever the model's weights are.
    device = next(rnn.parameters()).device
    inp, target = inp.to(device), target.to(device)

    # Cut the hidden state loose from the previous batch's graph, otherwise
    # the backward pass would reach back through every earlier batch.
    if isinstance(hidden, (tuple, list)):
        hidden = tuple(state.detach() for state in hidden)
    else:
        hidden = hidden.detach()

    rnn.zero_grad()
    output, hidden = rnn(inp, hidden)
    loss = criterion(output, target)
    loss.backward()
    # Keep the gradient norm bounded to guard against exploding gradients.
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    return loss.item(), hidden
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_forward_back_prop(RNN, forward_back_prop, train_on_gpu)
""" DON'T MODIFY ANYTHING IN THIS CELL """
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    """
    Train the network and report the average loss as training goes along

    Batches come from the module-level ``train_loader``; each one is handed
    to ``forward_back_prop``. Only completely full batches are used.

    :param rnn: The PyTorch Module that holds the neural network
    :param batch_size: The number of sequences in a batch
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param n_epochs: The number of passes over the data
    :param show_every_n_batches: How many batches to average over between printouts
    :return: The network that was passed in, after training
    """
    recent_losses = []

    rnn.train()

    print("Training for %d epoch(s)…" % n_epochs)
    for epoch in range(1, n_epochs + 1):

        # every epoch starts from a fresh hidden state
        hidden = rnn.init_hidden(batch_size)

        for step, (inputs, labels) in enumerate(train_loader, 1):

            # stop before the trailing, partially filled batch
            full_batches = len(train_loader.dataset)//batch_size
            if step > full_batches:
                break

            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            recent_losses.append(loss)

            if step % show_every_n_batches != 0:
                continue

            # report the mean loss since the last printout and start over
            print('Epoch: {:>4}/{:<4} Loss: {}\n'.format(
                epoch, n_epochs, np.average(recent_losses)))
            recent_losses = []

    return rnn
# The numeric values below are starting points rather than tuned settings;
# adjust them to the hardware and to how the training loss behaves.

# Data params
sequence_length = 10  # of words in a sequence
batch_size = 128

train_loader = batch_data(int_text, sequence_length, batch_size)

# Training parameters
num_epochs = 10
learning_rate = 0.001

# Model parameters
# One input id and one output score for every word in the lookup table.
vocab_size = len(vocab_to_int)
output_size = vocab_size
embedding_dim = 200
hidden_dim = 256
n_layers = 2

# Show stats for every n number of batches
show_every_n_batches = 500
""" DON'T MODIFY ANYTHING IN THIS CELL """
# Build the network from the settings chosen above.
rnn = RNN(vocab_size=vocab_size,
          output_size=output_size,
          embedding_dim=embedding_dim,
          hidden_dim=hidden_dim,
          n_layers=n_layers,
          dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# Optimizer and loss used for training.
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Run the training loop.
trained_rnn = train_rnn(rnn=rnn,
                        batch_size=batch_size,
                        optimizer=optimizer,
                        criterion=criterion,
                        n_epochs=num_epochs,
                        show_every_n_batches=show_every_n_batches)

# Store the trained network under the name that is loaded back later on.
helper.save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')
""" DON'T MODIFY ANYTHING IN THIS CELL """ import torch import helper import problem_unittests as tests
# Read the pre-processing output back in; the first item of the returned
# tuple is not needed here and is discarded.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Load the model stored under the same name that was used when saving it.
trained_rnn = helper.load_model('./save/trained_rnn')
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ import torch.nn.functional as F
def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):
    """
    Generate text using the neural network

    Starting from a sequence that is all padding except for the prime word,
    the network is asked for the next word ``predict_len`` times. Each next
    word is drawn at random from the five highest scoring candidates, in
    proportion to their probabilities. The sequence length is read from the
    module-level ``sequence_length`` and the device from ``train_on_gpu``.

    :param rnn: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation tokens keys to punctuation values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :return: The generated text
    """
    rnn.eval()

    # create a sequence (batch_size=1) with the prime_id
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    for _ in range(predict_len):
        # The rolling window stays a NumPy array; only a copy goes to the
        # network. Overwriting it with a CUDA tensor would make the
        # ``np.roll`` at the end of the loop fail.
        seq_tensor = torch.LongTensor(current_seq)
        if train_on_gpu:
            seq_tensor = seq_tensor.cuda()

        # initialize the hidden state
        hidden = rnn.init_hidden(seq_tensor.size(0))

        # get the output of the rnn; no gradients are needed for sampling
        with torch.no_grad():
            output, _ = rnn(seq_tensor, hidden)

        # get the next word probabilities
        p = F.softmax(output, dim=1).data
        if train_on_gpu:
            p = p.cpu()  # move to cpu

        # use top_k sampling to get the index of the next word
        top_k = 5
        p, top_i = p.topk(top_k)
        top_i = top_i.numpy().squeeze()

        # select the likely next word index with some element of randomness
        p = p.numpy().squeeze()
        word_i = np.random.choice(top_i, p=p/p.sum())

        # retrieve that word from the dictionary
        word = int_to_vocab[word_i]
        predicted.append(word)

        # the generated word becomes the end of the next input sequence
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens with the punctuation they stand for
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences
gen_length = 400 # modify the length to your preference
prime_word = 'jerry' # name for starting the script

""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """
# The script starts from the id of the speaker tag, i.e. the name followed by
# a colon, and is padded with the id of the padding word.
pad_word = helper.SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

# Save the script to a text file; the context manager closes the file even if
# the write fails.
with open("generated_script_1.txt", "w") as f:
    f.write(generated_script)



















